# -*- coding: utf-8 -*-
import scrapy
import re
import pymysql
from scrapy.http import Request, FormRequest
from scrapy.utils.project import get_project_settings

class BabasuperSpider(scrapy.Spider):
    """Spider for babasuper.com: logs in via the SMS-login endpoint, then
    crawls the truck-source and goods-source listing pages, follows every
    row to its detail page, and persists each record into MySQL.

    Session handling relies on Scrapy's ``cookiejar`` meta key so the
    cookies obtained at login are carried through every later request.
    """
    name = "babasuper"
    allowed_domains = ["www.babasuper.com"]
    start_urls = [
        'http://www.babasuper.com/port/trucksource.shtml',
        'http://www.babasuper.com/port/goodssource.shtml',
    ]

    # Headers sent with the login POST. (Fixed: the original Content-Type
    # value carried a stray leading space.)
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip,deflate",
        "Accept-Language": "en-US,en;q=0.8,zh-TW;q=0.6,zh;q=0.4",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
        "Referer": "http://www.babasuper.com/"
    }

    # Upper page bounds observed on the site at scrape time.
    # TODO(review): these are hard-coded; confirm against the live pager.
    TRUCK_MAX_PAGE = 1710
    GOODS_MAX_PAGE = 2350

    # Strips HTML tags from extracted cell markup; compiled once.
    _TAG_RE = re.compile(r'<.*?>')

    def start_requests(self):
        """Kick off the crawl with the login page.

        A ``cookiejar`` meta key is set so that the session established by
        the subsequent login POST is tracked and reused by every request
        that copies this meta value forward.
        """
        return [Request(
            "http://www.babasuper.com/port/login.jsp",
            meta={'cookiejar': 1},
            callback=self.post_login,
        )]

    def post_login(self, response):
        """POST the login form; on success Scrapy calls :meth:`after_login`.

        Credentials come from the project settings key ``BABASUPER``
        (``mobile`` / ``password``).
        """
        self.logger.info('Preparing login')
        settings = get_project_settings()
        return [FormRequest(
            "http://www.babasuper.com/port/smsLogin.shtml",
            method='POST',
            # Propagate the cookiejar so the login cookies stick to this session.
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            formdata={
                'mobile': settings['BABASUPER']['mobile'],
                'password': settings['BABASUPER']['password'],
            },
            callback=self.after_login,
            dont_filter=True,
        )]

    def after_login(self, response):
        """After login, enqueue every listing page of both sources."""
        self.logger.info("after login")
        for url in self.start_urls:
            if 'trucksource' in url:
                max_page = self.TRUCK_MAX_PAGE
            elif 'goodssource' in url:
                max_page = self.GOODS_MAX_PAGE
            else:
                continue
            for page in range(1, max_page + 1):
                yield Request(
                    "%s?page=%d" % (url, page),
                    meta={'cookiejar': response.meta['cookiejar']},
                )

    def _extract_rows(self, response, col_num):
        """Extract listing rows from a list page.

        Each ``<li>`` row contains ``col_num`` text cells plus a detail-page
        link; returns a list of rows, each ``col_num`` strings followed by
        the detail URL (so ``row[col_num]`` is the URL).
        """
        cells = response.selector.xpath(
            '//div[@class="show-list"]/ul/li[@class="in-show-li1 clearfix"]/a/div'
        ).extract()
        # Strip markup, then fold the flat cell list into rows of col_num.
        texts = [self._TAG_RE.sub('', cell).strip() for cell in cells]
        it = iter(texts)
        rows = [[next(it) for _ in range(col_num)]
                for _ in range(len(texts) // col_num)]
        urls = response.selector.xpath(
            '//div[@class="show-list"]/ul/li[@class="in-show-li1 clearfix"]/a/@href'
        ).extract()
        # Append each row's detail URL as its last element.
        return [row + [url] for row, url in zip(rows, urls)]

    def parse(self, response):
        """Dispatch a listing page to the matching detail-page callback."""
        self.logger.debug("parse %s", response.url)
        if 'trucksource' in response.url:
            col_num, callback = 7, self.parse_truck_details
        elif 'goodssource' in response.url:
            col_num, callback = 6, self.parse_goods_details
        else:
            return
        for item in self._extract_rows(response, col_num):
            yield Request(
                item[col_num],  # last element is the detail-page URL
                meta={'cookiejar': response.meta['cookiejar'], 'item': item},
                callback=callback,
            )

    @staticmethod
    def _first(values, default=''):
        """First element of an ``extract()`` list, or *default* when empty."""
        return values[0] if values else default

    @staticmethod
    def _pick_city(values):
        """Resolve a from/to city list.

        The site renders '不限' ("no limit") as a placeholder first entry;
        in that case the real city is the second entry. Empty input yields
        '' (the original code raised IndexError here).
        """
        if not values:
            return ''
        if values[0] == '不限' and len(values) > 1:
            return values[1]
        return values[0]

    @staticmethod
    def _split_contact(contact):
        """Split a contact extract into (name, phone).

        Two entries means name + phone; a single entry is phone only;
        empty input yields ('', '') instead of crashing.
        """
        if len(contact) == 2:
            return contact[0], contact[1]
        return '', (contact[0] if contact else '')

    def _save_row(self, table_suffix, columns, values):
        """REPLACE one row into the configured MySQL table.

        Uses a parameterized query — the original built the statement by
        concatenating scraped text, which broke on quotes and was an SQL
        injection vector. The connection/cursor are always closed, and
        failures are logged and rolled back instead of silently swallowed.
        """
        settings = get_project_settings()
        cfg = settings['MYSQL_DB']
        sql = "REPLACE INTO `%s%s` (%s) VALUES (%s)" % (
            cfg['prefix'],
            table_suffix,
            ", ".join("`%s`" % col for col in columns),
            ", ".join(["%s"] * len(columns)),
        )
        db = pymysql.connect(
            host=cfg['host'],
            user=cfg['user'],
            password=cfg['password'],
            database=cfg['name'],
            charset='utf8',
        )
        try:
            with db.cursor() as cursor:
                cursor.execute(sql, values)
            db.commit()
        except pymysql.MySQLError:
            self.logger.exception("failed to save row into %s table", table_suffix)
            db.rollback()
        finally:
            db.close()

    def parse_truck_details(self, response):
        """Parse one truck detail page and persist it to the `truck` table."""
        self.logger.debug('parse_truck_details')
        item = response.meta['item']
        sel = response.selector

        car_type = sel.xpath('//li[@class="int-li2"]/div[@class="int-small-box1 fl"]/i/text()').extract()
        car_length = sel.xpath('//li[@class="int-li2"]/div[@class="int-small-box2 fl"]/i/text()').extract()
        plate_number_all = sel.xpath('//li[@class="int-li3"]/div[@class="int-small-box1 fl"]/i/text()').extract()
        remark = sel.xpath('//li[@class="int-li3"]/div[@class="int-small-box2 fl"]/i/text()').extract()
        contact = sel.xpath('//div[@class="user-con fl"]/div[@class="user-name"]/i/text()').extract()
        contact_idcard = sel.xpath('//div[@class="user-con fl"]/div[@class="user-id"]/i/text()').extract()
        start_date = sel.xpath('//div[@class="article1-top"]/div[@class="fl"]/i/text()').extract()
        from_city = sel.xpath('//div[@class="ve-add1 fl"]/p/text()').extract()
        to_city = sel.xpath('//div[@class="ve-add3 fl"]/p/text()').extract()

        contact_name, contact_phone = self._split_contact(contact)
        truck = {
            'url': response.url,                                  # source URL
            'plate_number_all': self._first(plate_number_all),    # licence plate
            'load': item[1],                                      # load weight/volume from the listing row
            'from_city': self._pick_city(from_city),              # departure city
            'to_city': self._pick_city(to_city),                  # destination city
            'start_date': self._first(start_date),                # departure date
            'contact_name': contact_name,
            'contact_phone': contact_phone,
            'contact_idcard': self._first(contact_idcard),
            'car_type': self._first(car_type),                    # truck type (flatbed, high-rail, ...)
            'car_length': self._first(car_length),                # truck length
            'remark': self._first(remark),
        }

        columns = ['plate_number_all', 'load', 'from_city', 'to_city',
                   'start_date', 'car_type', 'car_length', 'contact_name',
                   'contact_idcard', 'contact_phone', 'url', 'remark']
        self._save_row('truck', columns, [truck[col] for col in columns])

    def parse_goods_details(self, response):
        """Parse one goods detail page and persist it to the `goods` table."""
        self.logger.debug('parse_goods_details')
        item = response.meta['item']
        sel = response.selector

        volume = sel.xpath('//li[@class="int-li2"]/div[@class="int-small-box1 fl"]/i/text()').extract()
        weight = sel.xpath('//li[@class="int-li2"]/div[@class="int-small-box2 fl"]/i/text()').extract()
        car_length = sel.xpath('//li[@class="int-li3"]/div[@class="int-small-box1 fl"]/i/text()').extract()
        car_type = sel.xpath('//li[@class="int-li3"]/div[@class="int-small-box2 fl"]/i/text()').extract()
        # On this page the second int-li3 cell holds the freight price when
        # both cells are present; a lone cell is the freight itself.
        if len(car_type) == 2:
            freight = car_type[1]
        else:
            freight = self._first(car_type)
        contact = sel.xpath('//div[@class="user-con fl"]/div[@class="user-name"]/i/text()').extract()
        contact_idcard = sel.xpath('//div[@class="user-con fl"]/div[@class="user-id"]/i/text()').extract()
        start_date = sel.xpath('//div[@class="article1-top"]/div[@class="fl"]/i/text()').extract()
        from_city = sel.xpath('//div[@class="ve-add1 fl"]/p/text()').extract()
        # Drop whitespace-only entries the page interleaves here.
        from_city = [city for city in from_city if city.strip() != '']
        to_city = sel.xpath('//div[@class="ve-add3 fl"]/p/text()').extract()

        contact_name, contact_phone = self._split_contact(contact)
        goods = {
            'from_city': self._pick_city(from_city),              # departure city
            'to_city': self._pick_city(to_city),                  # destination city
            'start_date': self._first(start_date),                # departure date
            'goods_info': item[2],                                # goods description from the listing row
            'freight': freight,                                   # freight price
            'volume': self._first(volume),
            'weight': self._first(weight),
            # car_type[0] is only the requested truck type when the freight
            # occupies the second cell; otherwise leave it blank.
            'car_type': car_type[0] if len(car_type) > 1 else '',
            'car_length': self._first(car_length),                # requested truck length
            'contact_name': contact_name,
            'contact_phone': contact_phone,
            'contact_idcard': self._first(contact_idcard),
            'url': response.url,                                  # source URL
            'remark': item[4],                                    # remark from the listing row
        }

        columns = ['from_city', 'to_city', 'start_date', 'goods_info',
                   'freight', 'volume', 'weight', 'car_type', 'car_length',
                   'contact_name', 'contact_idcard', 'contact_phone',
                   'url', 'remark']
        self._save_row('goods', columns, [goods[col] for col in columns])

